From d4282ab308d880580089973ca9a8ab804edd7e22 Mon Sep 17 00:00:00 2001 From: "akw27@arcadians.cl.cam.ac.uk" Date: Mon, 21 Mar 2005 19:57:02 +0000 Subject: [PATCH] bitkeeper revision 1.1236.38.1 (423f270ey18R1fJMKT6mo5gO0HvPiQ) Incremental fixes to plx stuff. Signed-off-by: andrew.warfield@cl.cam.ac.uk --- .rootkeys | 4 + tools/blktap/Makefile | 33 +- tools/blktap/blktaplib.c | 24 +- tools/blktap/blockstore-tls.c | 161 ++++++++ tools/blktap/blockstore.c | 218 ++++++++++- tools/blktap/blockstore.h | 26 ++ tools/blktap/parallax-threaded.c | 654 +++++++++++++++++++++++++++++++ tools/blktap/parallax-threaded.h | 23 ++ tools/blktap/parallax.c | 6 +- tools/blktap/radix.c | 125 ++++-- tools/blktap/radix.h | 1 + tools/blktap/snaplog.c | 64 +++ tools/blktap/snaplog.h | 11 +- tools/blktap/vdi.c | 5 +- tools/blktap/vdi_snap_delete.c | 48 +++ tools/blktap/vdi_snap_list.c | 13 +- 16 files changed, 1356 insertions(+), 60 deletions(-) create mode 100644 tools/blktap/blockstore-tls.c create mode 100644 tools/blktap/parallax-threaded.c create mode 100644 tools/blktap/parallax-threaded.h create mode 100644 tools/blktap/vdi_snap_delete.c diff --git a/.rootkeys b/.rootkeys index ddfd684dfa..abb816b8fe 100644 --- a/.rootkeys +++ b/.rootkeys @@ -339,12 +339,15 @@ 42090340_mvZtozMjghPJO0qsjk4NQ tools/blktap/blkint.h 42090340rc2q1wmlGn6HtiJAkqhtNQ tools/blktap/blktaplib.c 42090340C-WkRPT7N3t-8Lzehzogdw tools/blktap/blktaplib.h +423f270cAbkh2f-DHtT0hmCtFFXVXg tools/blktap/blockstore-tls.c 42277b02WrfP1meTDPv1M5swFq8oHQ tools/blktap/blockstore.c 42277b02P1C0FYj3gqwTZUD8sxKCug tools/blktap/blockstore.h 42090340B3mDvcxvd9ehDHUkg46hvw tools/blktap/libgnbd/Makefile 42090340ZWkc5Xhf9lpQmDON8HJXww tools/blktap/libgnbd/gnbdtest.c 42090340ocMiUScJE3OpY7QNunvSbg tools/blktap/libgnbd/libgnbd.c 42090340G5_F_EeVnPORKB0pTMGGhA tools/blktap/libgnbd/libgnbd.h +423f270cbEKiTMapKnCyqkuwGvgOMA tools/blktap/parallax-threaded.c +423f270cFdXryIcD7HTPUl_Dbk4DAQ tools/blktap/parallax-threaded.h 
42277b03930x2TJT3PZlw6o0GERXpw tools/blktap/parallax.c 42277b03XQYq8bujXSz7JAZ8N7j_pA tools/blktap/radix.c 42277b03vZ4-jno_mgKmAcCW3ycRAg tools/blktap/radix.h @@ -356,6 +359,7 @@ 42277b04xB_iUmiSm6nKcy8OV8bckA tools/blktap/vdi_fill.c 42277b045CJGD_rKH-ZT_-0X4knhWA tools/blktap/vdi_list.c 42277b043ZKx0NJSbcgptQctQ5rerg tools/blktap/vdi_snap.c +423f270c_QDjGLQ_YdaOtyBM5n9BDg tools/blktap/vdi_snap_delete.c 42277b043Fjy5-H7LyBtUPyDlZFo6A tools/blktap/vdi_snap_list.c 42277b04vhqD6Lq3WmGbaESoAAKdhw tools/blktap/vdi_tree.c 42277b047H8fTVyUf75BWAjh6Zpsqg tools/blktap/vdi_validate.c diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile index 50d77b905b..2699563f6d 100644 --- a/tools/blktap/Makefile +++ b/tools/blktap/Makefile @@ -21,8 +21,12 @@ SRCS += blktaplib.c PLX_SRCS := PLX_SRCS += vdi.c PLX_SRCS += radix.c -PLX_SRCS += blockstore.c PLX_SRCS += snaplog.c +PLXT_SRCS := $(PLX_SRCS) +#PLXT_SRCS += blockstore-tls.c +PLXT_SRCS += blockstore.c +PLXT_SRCS += parallax-threaded.c +PLX_SRCS += blockstore.c VDI_SRCS := $(PLX_SRCS) PLX_SRCS += parallax.c @@ -31,6 +35,7 @@ VDI_TOOLS += vdi_create VDI_TOOLS += vdi_list VDI_TOOLS += vdi_snap VDI_TOOLS += vdi_snap_list +VDI_TOOLS += vdi_snap_delete VDI_TOOLS += vdi_fill VDI_TOOLS += vdi_tree VDI_TOOLS += vdi_validate @@ -91,7 +96,7 @@ libblktap.so: libblktap.so.$(MAJOR): ln -sf libblktap.so.$(MAJOR).$(MINOR) $@ libblktap.so.$(MAJOR).$(MINOR): $(OBJS) - $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ -L../libxutil -lxutil -lz + $(CC) -Wl,-soname -Wl,$(SONAME) -shared -o $@ $^ -lpthread -L../libxutil -lxutil -lz blkdump: $(LIB) $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -l blktap blkdump.c @@ -117,6 +122,9 @@ blkaio: $(LIB) blkaio.c blkaiolib.c parallax: $(LIB) $(PLX_SRCS) $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap $(PLX_SRCS) libgnbd/libgnbd.a +parallax-threaded: $(LIB) $(PLXT_SRCS) + $(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. 
-lpthread -lblktap $(PLXT_SRCS) libgnbd/libgnbd.a + vdi_test: $(LIB) $(VDI_SRCS) $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE $(VDI_SRCS) @@ -132,6 +140,9 @@ vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS) vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS) $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(VDI_SRCS) +vdi_snap_delete: $(LIB) vdi_snap_delete.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o vdi_snap_delete vdi_snap_delete.c $(VDI_SRCS) + vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS) $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(VDI_SRCS) @@ -142,12 +153,22 @@ vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS) $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS) -rdx_cmp: $(LIB) rdx_cmp.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o rdx_cmp rdx_cmp.c $(VDI_SRCS) - - .PHONY: TAGS clean install mk-symlinks rpm TAGS: etags -t $(SRCS) *.h -include $(DEPS) + +#Random testing targets. To be removed eventually. + +rdx_cmp: $(LIB) rdx_cmp.c $(VDI_SRCS) + $(CC) $(CFLAGS) -g3 -o rdx_cmp rdx_cmp.c $(VDI_SRCS) + +bb-tls: $(LIB) blockstore-benchmark.c + $(CC) $(CFLAGS) -o bb-tls blockstore-benchmark.c blockstore-tls.c -lpthread + +bb-trans: $(LIB) blockstore-benchmark.c + $(CC) $(CFLAGS) -o bb-trans blockstore-benchmark.c blockstore.c -lpthread + +radix-test: $(LIB) radix.c blockstore-threaded-trans.c + $(CC) $(CFLAGS) -g3 -D RADIX_STANDALONE -o radix-test radix.c blockstore-threaded-trans.c diff --git a/tools/blktap/blktaplib.c b/tools/blktap/blktaplib.c index a50eaa909b..35b893f677 100644 --- a/tools/blktap/blktaplib.c +++ b/tools/blktap/blktaplib.c @@ -3,6 +3,8 @@ * * userspace interface routines for the blktap driver. * + * (threadsafe(r) version) + * * (c) 2004 Andrew Warfield. */ @@ -21,11 +23,13 @@ #include #include #include +#include + #define __COMPILING_BLKTAP_LIB #include "blktaplib.h" -#if 1 +#if 0 #define DPRINTF(_f, _a...) printf ( _f , ## _a ) #else #define DPRINTF(_f, _a...) 
((void)0) @@ -194,15 +198,19 @@ void print_hooks(void) /*-----[ Data to/from Backend (server) VM ]------------------------------*/ + + inline int write_req_to_be_ring(blkif_request_t *req) { blkif_request_t *req_d; + static pthread_mutex_t be_prod_mutex = PTHREAD_MUTEX_INITIALIZER; - //req_d = FRONT_RING_NEXT_EMPTY_REQUEST(&be_ring); + pthread_mutex_lock(&be_prod_mutex); req_d = RING_GET_REQUEST(&be_ring, be_ring.req_prod_pvt); memcpy(req_d, req, sizeof(blkif_request_t)); wmb(); be_ring.req_prod_pvt++; + pthread_mutex_unlock(&be_prod_mutex); return 0; } @@ -210,12 +218,14 @@ inline int write_req_to_be_ring(blkif_request_t *req) inline int write_rsp_to_fe_ring(blkif_response_t *rsp) { blkif_response_t *rsp_d; + static pthread_mutex_t fe_prod_mutex = PTHREAD_MUTEX_INITIALIZER; - //rsp_d = BACK_RING_NEXT_EMPTY_RESPONSE(&fe_ring); + pthread_mutex_lock(&fe_prod_mutex); rsp_d = RING_GET_RESPONSE(&fe_ring, fe_ring.rsp_prod_pvt); memcpy(rsp_d, rsp, sizeof(blkif_response_t)); wmb(); fe_ring.rsp_prod_pvt++; + pthread_mutex_unlock(&fe_prod_mutex); return 0; } @@ -336,6 +346,10 @@ int blktap_listen(void) ctrl_sring_t *csring; RING_IDX rp, i, pfd_count; + /* pending rings */ + blkif_request_t req_pending[BLKIF_RING_SIZE]; + blkif_response_t rsp_pending[BLKIF_RING_SIZE]; + /* handler hooks: */ request_hook_t *req_hook; response_hook_t *rsp_hook; @@ -447,6 +461,8 @@ int blktap_listen(void) int done = 0; /* stop forwarding this request */ req = RING_GET_REQUEST(&fe_ring, i); + memcpy(&req_pending[ID_TO_IDX(req->id)], req, sizeof(*req)); + req = &req_pending[ID_TO_IDX(req->id)]; DPRINTF("copying an fe request\n"); @@ -487,6 +503,8 @@ int blktap_listen(void) { rsp = RING_GET_RESPONSE(&be_ring, i); + memcpy(&rsp_pending[ID_TO_IDX(rsp->id)], rsp, sizeof(*rsp)); + rsp = &rsp_pending[ID_TO_IDX(rsp->id)]; DPRINTF("copying a be request\n"); diff --git a/tools/blktap/blockstore-tls.c b/tools/blktap/blockstore-tls.c new file mode 100644 index 0000000000..67808d7c28 --- /dev/null +++ 
b/tools/blktap/blockstore-tls.c @@ -0,0 +1,161 @@ +/************************************************************************** + * + * blockstore.c + * + * Simple block store interface + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "blockstore.h" +#include "parallax-threaded.h" + +/*static int block_fp = -1;*/ + +static int fd_list[READ_POOL_SIZE+1]; + +/** + * readblock: read a block from disk + * @id: block id to read + * + * @return: pointer to block, NULL on error + */ + +void *readblock(u64 id) +{ + void *block; + int tid = (int)pthread_getspecific(tid_key); + + if (lseek64(fd_list[tid], ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + printf ("%Ld\n", (id - 1) * BLOCK_SIZE); + perror("readblock lseek"); + goto err; + } + if ((block = malloc(BLOCK_SIZE)) == NULL) { + perror("readblock malloc"); + goto err; + } + if (read(fd_list[tid], block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("readblock read"); + free(block); + goto err; + } + return block; + +err: + return NULL; +} + +/** + * writeblock: write an existing block to disk + * @id: block id + * @block: pointer to block + * + * @return: zero on success, -1 on failure + */ +int writeblock(u64 id, void *block) +{ + int tid = (int)pthread_getspecific(tid_key); + + if (lseek64(fd_list[tid], ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + perror("writeblock lseek"); + goto err; + } + if (write(fd_list[tid], block, BLOCK_SIZE) < 0) { + perror("writeblock write"); + goto err; + } + return 0; + +err: + return -1; +} + +/** + * allocblock: write a new block to disk + * @block: pointer to block + * + * @return: new id of block on disk + */ + +u64 allocblock(void *block) +{ + u64 lb; + off64_t pos; + int tid = (int)pthread_getspecific(tid_key); + + pos = lseek64(fd_list[tid], 0, SEEK_END); + if (pos == (off64_t)-1) { + perror("allocblock lseek"); + goto err; + } + if (pos % BLOCK_SIZE != 0) { + fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); + 
goto err; + } + if (write(fd_list[tid], block, BLOCK_SIZE) != BLOCK_SIZE) { + perror("allocblock write"); + goto err; + } + lb = pos / BLOCK_SIZE + 1; + + return lb; + +err: + return 0; + +} + + +/** + * newblock: get a new in-memory block set to zeros + * + * @return: pointer to new block, NULL on error + */ +void *newblock() +{ + void *block = malloc(BLOCK_SIZE); + if (block == NULL) { + perror("newblock"); + return NULL; + } + memset(block, 0, BLOCK_SIZE); + return block; +} + + +/** + * freeblock: unallocate an in-memory block + * @id: block id (zero if this is only in-memory) + * @block: block to be freed + */ +void freeblock(void *block) +{ + if (block != NULL) + free(block); +} + + +int __init_blockstore(void) +{ + int i; + + for (i=0; i<(READ_POOL_SIZE+1); i++) { + + fd_list[i] = open("blockstore.dat", + O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (fd_list[i] < 0) { + perror("open"); + return -1; + } + } + return 0; +} diff --git a/tools/blktap/blockstore.c b/tools/blktap/blockstore.c index 179fcdc3c3..6445cfd324 100644 --- a/tools/blktap/blockstore.c +++ b/tools/blktap/blockstore.c @@ -14,8 +14,11 @@ #include #include #include "blockstore.h" +#include "parallax-threaded.h" -static int block_fp = -1; +/*static int block_fp = -1;*/ + +static int fd_list[READ_POOL_SIZE+1]; /** * readblock: read a block from disk @@ -26,21 +29,36 @@ static int block_fp = -1; void *readblock(u64 id) { void *block; + int block_fp; + + block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return NULL; + } + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { + printf ("%Ld ", id); printf ("%Ld\n", (id - 1) * BLOCK_SIZE); perror("readblock lseek"); - return NULL; + goto err; } if ((block = malloc(BLOCK_SIZE)) == NULL) { perror("readblock malloc"); - return NULL; + goto err; } if (read(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { perror("readblock read"); free(block); - return NULL; + goto 
err; } + close(block_fp); return block; + +err: + close(block_fp); + return NULL; } /** @@ -51,15 +69,30 @@ void *readblock(u64 id) { * @return: zero on success, -1 on failure */ int writeblock(u64 id, void *block) { + + int block_fp; + + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return -1; + } + if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { perror("writeblock lseek"); - return -1; + goto err; } if (write(block_fp, block, BLOCK_SIZE) < 0) { perror("writeblock write"); - return -1; + goto err; } + close(block_fp); return 0; + +err: + close(block_fp); + return -1; } /** @@ -68,30 +101,41 @@ int writeblock(u64 id, void *block) { * * @return: new id of block on disk */ -static u64 lastblock = 0; u64 allocblock(void *block) { u64 lb; - off64_t pos = lseek64(block_fp, 0, SEEK_END); + off64_t pos; + int block_fp; + + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (block_fp < 0) { + perror("open"); + return 0; + } + + pos = lseek64(block_fp, 0, SEEK_END); if (pos == (off64_t)-1) { perror("allocblock lseek"); - return 0; + goto err; } if (pos % BLOCK_SIZE != 0) { fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - return 0; + goto err; } if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) { perror("allocblock write"); - return 0; + goto err; } lb = pos / BLOCK_SIZE + 1; +//printf("alloc(%Ld)\n", lb); + close(block_fp); + return lb; - if (lb <= lastblock) - printf("[*** %Ld alredy allocated! 
***]\n", lb); +err: + close(block_fp); + return 0; - lastblock = lb; - return lb; } @@ -121,15 +165,157 @@ void freeblock(void *block) { free(block); } +static freeblock_t *new_freeblock(void) +{ + freeblock_t *fb; + + fb = newblock(); + + if (fb == NULL) return NULL; + + fb->magic = FREEBLOCK_MAGIC; + fb->next = 0ULL; + fb->count = 0ULL; + memset(fb->list, 0, sizeof fb->list); + + return fb; +} + +void releaseblock(u64 id) +{ + blockstore_super_t *bs_super; + freeblock_t *fl_current; + + /* get superblock */ + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + + /* get freeblock_current */ + if (bs_super->freelist_current == 0ULL) + { + fl_current = new_freeblock(); + bs_super->freelist_current = allocblock(fl_current); + writeblock(BLOCKSTORE_SUPER, bs_super); + } else { + fl_current = readblock(bs_super->freelist_current); + } + + /* if full, chain to superblock and allocate new current */ + + if (fl_current->count == FREEBLOCK_SIZE) { + fl_current->next = bs_super->freelist_full; + writeblock(bs_super->freelist_current, fl_current); + bs_super->freelist_full = bs_super->freelist_current; + freeblock(fl_current); + fl_current = new_freeblock(); + bs_super->freelist_current = allocblock(fl_current); + writeblock(BLOCKSTORE_SUPER, bs_super); + } + + /* append id to current */ + fl_current->list[fl_current->count++] = id; + writeblock(bs_super->freelist_current, fl_current); + + freeblock(fl_current); + freeblock(bs_super); + + +} + +/* freelist debug functions: */ +void freelist_count(int print_each) +{ + blockstore_super_t *bs_super; + freeblock_t *fb; + u64 total = 0, next; + + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + + if (bs_super->freelist_current == 0ULL) { + printf("freelist is empty!\n"); + return; + } + + fb = readblock(bs_super->freelist_current); + printf("%Ld entires on current.\n", fb->count); + total += fb->count; + if (print_each == 1) + { + int i; + for (i=0; i< fb->count; i++) + printf(" %Ld\n", fb->list[i]); 
+ } + + freeblock(fb); + + if (bs_super->freelist_full == 0ULL) { + printf("freelist_full is empty!\n"); + return; + } + + next = bs_super->freelist_full; + for (;;) { + fb = readblock(next); + total += fb->count; + if (print_each == 1) + { + int i; + for (i=0; i< fb->count; i++) + printf(" %Ld\n", fb->list[i]); + } + next = fb->next; + freeblock(fb); + if (next == 0ULL) break; + } + printf("Total of %Ld ids on freelist.\n", total); +} int __init_blockstore(void) { + int i; + blockstore_super_t *bs_super; + u64 ret; + int block_fp; + block_fp = open("blockstore.dat", O_RDWR | O_CREAT | O_LARGEFILE, 0644); if (block_fp < 0) { perror("open"); - return -1; + exit(-1); } + if (lseek(block_fp, 0, SEEK_END) == 0) { + bs_super = newblock(); + bs_super->magic = BLOCKSTORE_MAGIC; + bs_super->freelist_full = 0LL; + bs_super->freelist_current = 0LL; + + ret = allocblock(bs_super); + + freeblock(bs_super); + } else { + bs_super = (blockstore_super_t *) readblock(BLOCKSTORE_SUPER); + if (bs_super->magic != BLOCKSTORE_MAGIC) + { + printf("BLOCKSTORE IS CORRUPT! 
(no magic in superblock!)\n"); + exit(-1); + } + freeblock(bs_super); + } + + close(block_fp); + + + /* + for (i=0; i<(READ_POOL_SIZE+1); i++) { + + fd_list[i] = open("blockstore.dat", + O_RDWR | O_CREAT | O_LARGEFILE, 0644); + + if (fd_list[i] < 0) { + perror("open"); + return -1; + } + } + */ return 0; } diff --git a/tools/blktap/blockstore.h b/tools/blktap/blockstore.h index 0e531c5ab4..17eb22b45d 100644 --- a/tools/blktap/blockstore.h +++ b/tools/blktap/blockstore.h @@ -20,12 +20,38 @@ #define SECTOR_SHIFT 9 #endif +#define FREEBLOCK_SIZE (BLOCK_SIZE / sizeof(u64)) - (3 * sizeof(u64)) +#define FREEBLOCK_MAGIC 0x0fee0fee0fee0fee + +typedef struct { + u64 magic; + u64 next; + u64 count; + u64 list[FREEBLOCK_SIZE]; +} freeblock_t; + +#define BLOCKSTORE_MAGIC 0xaaaaaaa00aaaaaaa +#define BLOCKSTORE_SUPER 1ULL + +typedef struct { + u64 magic; + u64 freelist_full; + u64 freelist_current; +} blockstore_super_t; extern void *newblock(); extern void *readblock(u64 id); extern u64 allocblock(void *block); extern int writeblock(u64 id, void *block); + +/* Add this blockid to a freelist, to be recycled by the allocator. */ +extern void releaseblock(u64 id); + +/* this is a memory free() operation for block-sized allocations */ extern void freeblock(void *block); extern int __init_blockstore(void); +/* debug for freelist. */ +void freelist_count(int print_each); + #endif /* __BLOCKSTORE_H__ */ diff --git a/tools/blktap/parallax-threaded.c b/tools/blktap/parallax-threaded.c new file mode 100644 index 0000000000..25b80dea16 --- /dev/null +++ b/tools/blktap/parallax-threaded.c @@ -0,0 +1,654 @@ +/************************************************************************** + * + * parallax.c + * + * The Parallax Storage Server + * + */ + + +#include +#include +#include +#include +#include "blktaplib.h" +#include "blockstore.h" +#include "vdi.h" +#include "parallax-threaded.h" + +#define PARALLAX_DEV 61440 + + +#if 0 +#define DPRINTF(_f, _a...) 
printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* ------[ session records ]----------------------------------------------- */ + +#define BLKIF_HASHSZ 1024 +#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1)) + +#define VDI_HASHSZ 16 +#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1)) + +typedef struct blkif { + domid_t domid; + unsigned int handle; + enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; + vdi_t *vdi_hash[VDI_HASHSZ]; + struct blkif *hash_next; +} blkif_t; + +static blkif_t *blkif_hash[BLKIF_HASHSZ]; + +blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle) +{ + if ( handle != 0 ) + printf("blktap/parallax don't currently support non-0 dev handles!\n"); + + blkif_t *blkif = blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif != NULL) && + ((blkif->domid != domid) || (blkif->handle != handle)) ) + blkif = blkif->hash_next; + return blkif; +} + +vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device) +{ + vdi_t *vdi = blkif->vdi_hash[VDI_HASH(device)]; + + while ((vdi != NULL) && (vdi->vdevice != device)) + vdi = vdi->next; + + return vdi; +} + +/* ------[ control message handling ]-------------------------------------- */ + +void blkif_create(blkif_be_create_t *create) +{ + domid_t domid = create->domid; + unsigned int handle = create->blkif_handle; + blkif_t **pblkif, *blkif; + + DPRINTF("parallax (blkif_create): create is %p\n", create); + + if ( (blkif = (blkif_t *)malloc(sizeof(blkif_t))) == NULL ) + { + DPRINTF("Could not create blkif: out of memory\n"); + create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY; + return; + } + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + blkif->handle = handle; + blkif->status = DISCONNECTED; +/* + spin_lock_init(&blkif->vbd_lock); + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 0); +*/ + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( *pblkif != NULL ) + { + if ( ((*pblkif)->domid == domid) && 
((*pblkif)->handle == handle) ) + { + DPRINTF("Could not create blkif: already exists\n"); + create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS; + free(blkif); + return; + } + pblkif = &(*pblkif)->hash_next; + } + + blkif->hash_next = *pblkif; + *pblkif = blkif; + + DPRINTF("Successfully created blkif\n"); + create->status = BLKIF_BE_STATUS_OKAY; +} + +void blkif_destroy(blkif_be_destroy_t *destroy) +{ + domid_t domid = destroy->domid; + unsigned int handle = destroy->blkif_handle; + blkif_t **pblkif, *blkif; + + DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy); + + pblkif = &blkif_hash[BLKIF_HASH(domid, handle)]; + while ( (blkif = *pblkif) != NULL ) + { + if ( (blkif->domid == domid) && (blkif->handle == handle) ) + { + if ( blkif->status != DISCONNECTED ) + goto still_connected; + goto destroy; + } + pblkif = &blkif->hash_next; + } + + destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + + still_connected: + destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED; + return; + + destroy: + *pblkif = blkif->hash_next; + /* destroy_all_vbds(blkif); */ + free(blkif); + destroy->status = BLKIF_BE_STATUS_OKAY; +} + +void vbd_grow(blkif_be_vbd_grow_t *grow) +{ + blkif_t *blkif; + vdi_t *vdi, **vdip; + blkif_vdev_t vdevice = grow->vdevice; + + DPRINTF("parallax (vbd_grow): grow=%p\n", grow); + + blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + if ( blkif == NULL ) + { + DPRINTF("vbd_grow attempted for non-existent blkif (%u,%u)\n", + grow->domid, grow->blkif_handle); + grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + return; + } + + /* VDI identifier is in grow->extent.sector_start */ + DPRINTF("vbd_grow: grow->extent.sector_start (id) is %llx\n", + grow->extent.sector_start); + + vdi = vdi_get(grow->extent.sector_start); + if (vdi == NULL) + { + printf("parallax (vbd_grow): VDI %llx not found.\n", + grow->extent.sector_start); + grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + return; + } + + vdi->next = NULL; + vdi->vdevice 
= vdevice; + vdip = &blkif->vdi_hash[VDI_HASH(vdevice)]; + while (*vdip != NULL) + vdip = &(*vdip)->next; + *vdip = vdi; + + DPRINTF("vbd_grow: happy return!\n"); + grow->status = BLKIF_BE_STATUS_OKAY; +} + +int parallax_control(control_msg_t *msg) +{ + domid_t domid; + int ret; + + DPRINTF("parallax_control: msg is %p\n", msg); + + if (msg->type != CMSG_BLKIF_BE) + { + printf("Unexpected control message (%d)\n", msg->type); + return 0; + } + + switch(msg->subtype) + { + case CMSG_BLKIF_BE_CREATE: + if ( msg->length != sizeof(blkif_be_create_t) ) + goto parse_error; + blkif_create((blkif_be_create_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_DESTROY: + if ( msg->length != sizeof(blkif_be_destroy_t) ) + goto parse_error; + blkif_destroy((blkif_be_destroy_t *)msg->msg); + break; + + case CMSG_BLKIF_BE_VBD_GROW: + if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + goto parse_error; + vbd_grow((blkif_be_vbd_grow_t *)msg->msg); + break; + } + return 0; +parse_error: + printf("Bad control message!\n"); + return 0; + +} + +int parallax_probe(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + vdisk_t *img_info; + vdi_t *vdi; + int i, nr_vdis = 0; + + DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif); + + /* We expect one buffer only. */ + if ( req->nr_segments != 1 ) + goto err; + + /* Make sure the buffer is page-sized. 
*/ + if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || + (blkif_last_sect (req->frame_and_sects[0]) != 7) ) + goto err; + + /* fill the list of devices */ + for (i=0; ivdi_hash[i]; + while (vdi) { + img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); + img_info[nr_vdis].device = vdi->vdevice; + img_info[nr_vdis].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; + /* The -2 here accounts for the LSB in the radix tree */ + img_info[nr_vdis].capacity = + ((1LL << (VDI_HEIGHT-2)) >> SECTOR_SHIFT); + nr_vdis++; + vdi = vdi->next; + } + } + + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_PROBE; + rsp->status = nr_vdis; /* number of disks */ + + DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", nr_vdis); + return BLKTAP_RESPOND; +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_PROBE; + rsp->status = BLKIF_RSP_ERROR; + + DPRINTF("parallax_probe: send error response\n"); + return BLKTAP_RESPOND; +} + +typedef struct { + blkif_request_t *req; + int count; + pthread_mutex_t mutex; +} pending_t; + +#define MAX_REQUESTS 64 +pending_t pending_list[MAX_REQUESTS]; + +typedef struct { + vdi_t *vdi; + blkif_request_t *req; + int segment; + pending_t *pent; +} readseg_params_t; + +#define DISPATCH_SIZE 1024UL +#define DISPATCH_MASK (DISPATCH_SIZE-1) +readseg_params_t dispatch_list[DISPATCH_SIZE]; +unsigned long dprod = 0, dcons = 0; +pthread_mutex_t dispatch_mutex; +pthread_cond_t dispatch_cond; + +void *read_segment(void *param) +{ + readseg_params_t *p; + u64 vblock, gblock, sector; + char *dpage, *spage; + unsigned long size, start, offset; + blkif_response_t *rsp; + int tid; + +unsigned long dc, dp; + +#ifdef NOTHREADS +#else + /* Set this thread's tid. 
*/ + tid = *(int *)param; + free(param); + + pthread_setspecific(tid_key, (void *)tid); + + printf("My tid is %d.\n", (int)pthread_getspecific(tid_key)); +start: + pthread_mutex_lock(&dispatch_mutex); + while (dprod == dcons) + pthread_cond_wait(&dispatch_cond, &dispatch_mutex); + + if (dprod == dcons) { + /* unnecessary wakeup. */ + pthread_mutex_unlock(&dispatch_mutex); + goto start; + } +#endif +dc = dcons; +dp = dprod; + + p = &dispatch_list[dcons & DISPATCH_MASK]; + dcons++; +#ifdef NOTHREADS +#else + pthread_mutex_unlock(&dispatch_mutex); +#endif + dpage = (char *)MMAP_VADDR(ID_TO_IDX(p->req->id), p->segment); + + /* Round the requested segment to a block address. */ + + sector = p->req->sector_number + (8*p->segment); + vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; + + /* Get that block from the store. */ + + gblock = vdi_lookup_block(p->vdi, vblock, NULL); + + /* Calculate read size and offset within the read block. */ + + offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; + size = ( blkif_last_sect (p->req->frame_and_sects[p->segment]) - + blkif_first_sect(p->req->frame_and_sects[p->segment]) + 1 + ) << SECTOR_SHIFT; + start = blkif_first_sect(p->req->frame_and_sects[p->segment]) + << SECTOR_SHIFT; + + /* If the block does not exist in the store, return zeros. */ + /* Otherwise, copy that region to the guest page. 
*/ + +// printf(" : (%p, %d, %d) (%d) [c:%lu,p:%lu]\n", +// p->req, ID_TO_IDX(p->req->id), p->segment, +// p->pent->count, dc, dp); + + DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " + "vblock %llx, gblock %llx, " + "size %lx\n", + sector, blkif_first_sect(p->req->frame_and_sects[p->segment]), + blkif_last_sect (p->req->frame_and_sects[p->segment]), + vblock, gblock, size); + + if ( gblock == 0 ) { + + memset(dpage + start, '\0', size); + + } else { + + spage = readblock(gblock); + + if (spage == NULL) { + printf("Error reading gblock from store: %Ld\n", gblock); + goto err; + } + + memcpy(dpage + start, spage + offset, size); + + freeblock(spage); + } + + + /* Done the read. Now update the pending record. */ + + pthread_mutex_lock(&p->pent->mutex); + p->pent->count--; + + if (p->pent->count == 0) { + +// printf("FINISH: (%d, %d)\n", ID_TO_IDX(p->req->id), p->segment); + rsp = (blkif_response_t *)p->req; + rsp->id = p->req->id; + rsp->operation = BLKIF_OP_READ; + rsp->status = BLKIF_RSP_OKAY; + + blktap_inject_response(rsp); + } + + pthread_mutex_unlock(&p->pent->mutex); + +#ifdef NOTHREADS + return NULL; +#else + goto start; +#endif + +err: + printf("I am screwed!\n"); +#ifdef NOTHREADS + return NULL; +#else + goto start; +#endif +} + + +int parallax_read(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + unsigned long size, offset, start; + u64 sector; + u64 vblock, gblock; + vdi_t *vdi; + int i; + char *dpage, *spage; + pending_t *pent; + readseg_params_t *params; + + vdi = blkif_get_vdi(blkif, req->device); + + if ( vdi == NULL ) + goto err; + +// printf("START : (%p, %d, %d)\n", req, ID_TO_IDX(req->id), req->nr_segments); + + pent = &pending_list[ID_TO_IDX(req->id)]; + pent->count = req->nr_segments; + pent->req = req; + pthread_mutex_init(&pent->mutex, NULL); + + + for (i = 0; i < req->nr_segments; i++) { + pthread_t tid; + int ret; + + params = &dispatch_list[dprod & DISPATCH_MASK]; + params->pent = pent; + params->vdi = vdi; + params->req = 
req; + params->segment = i; + wmb(); + dprod++; + + pthread_mutex_lock(&dispatch_mutex); + pthread_cond_signal(&dispatch_cond); + pthread_mutex_unlock(&dispatch_mutex); +#ifdef NOTHREADS + read_segment(NULL); +#endif + + } + + + + + return BLKTAP_STOLEN; + +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_READ; + rsp->status = BLKIF_RSP_ERROR; + + return BLKTAP_RESPOND; +} + +int parallax_write(blkif_request_t *req, blkif_t *blkif) +{ + blkif_response_t *rsp; + u64 sector; + int i, writable = 0; + u64 vblock, gblock; + char *spage; + unsigned long size, offset, start; + vdi_t *vdi; + + vdi = blkif_get_vdi(blkif, req->device); + + if ( vdi == NULL ) + goto err; + + for (i = 0; i < req->nr_segments; i++) { + + spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); + + /* Round the requested segment to a block address. */ + + sector = req->sector_number + (8*i); + vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; + + /* Get that block from the store. */ + + gblock = vdi_lookup_block(vdi, vblock, &writable); + + /* Calculate read size and offset within the read block. */ + + offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; + size = ( blkif_last_sect (req->frame_and_sects[i]) - + blkif_first_sect(req->frame_and_sects[i]) + 1 + ) << SECTOR_SHIFT; + start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; + + DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), " + "vblock %llx, gblock %llx, " + "size %lx\n", + sector, blkif_first_sect(req->frame_and_sects[i]), + blkif_last_sect (req->frame_and_sects[i]), + vblock, gblock, size); + + /* XXX: For now we just freak out if they try to write a */ + /* non block-sized, block-aligned page. */ + + if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) { + printf("]\n] STRANGE WRITE!\n]\n"); + goto err; + } + + if (( gblock == 0 ) || ( writable == 0 )) { + + gblock = allocblock(spage); + vdi_update_block(vdi, vblock, gblock); + + } else { + + /* write-in-place, no need to change mappings. 
*/ + writeblock(gblock, spage); + + } + + } + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_WRITE; + rsp->status = BLKIF_RSP_OKAY; + + return BLKTAP_RESPOND; +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_WRITE; + rsp->status = BLKIF_RSP_ERROR; + + return BLKTAP_RESPOND; +} + +int parallax_request(blkif_request_t *req) +{ + blkif_response_t *rsp; + domid_t dom = ID_TO_DOM(req->id); + blkif_t *blkif = blkif_find_by_handle(dom, 0); + + //DPRINTF("parallax_request: req=%p, dom=%d, blkif=%p\n", req, dom, blkif); + + if (blkif == NULL) + goto err; + + if ( req->operation == BLKIF_OP_PROBE ) { + + return parallax_probe(req, blkif); + + } else if ( req->operation == BLKIF_OP_READ ) { + + return parallax_read(req, blkif); + + } else if ( req->operation == BLKIF_OP_WRITE ) { + + return parallax_write(req, blkif); + + } else { + /* Unknown operation */ + goto err; + } + +err: + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = req->operation; + rsp->status = BLKIF_RSP_ERROR; + return BLKTAP_RESPOND; +} + +void __init_parallax(void) +{ + memset(blkif_hash, 0, sizeof(blkif_hash)); +} + + + +int main(int argc, char *argv[]) +{ + pthread_t read_pool[READ_POOL_SIZE]; + int i, tid=0; + + DPRINTF("parallax: starting.\n"); + __init_blockstore(); + DPRINTF("parallax: initialized blockstore...\n"); + __init_vdi(); + DPRINTF("parallax: initialized vdi registry etc...\n"); + __init_parallax(); + DPRINTF("parallax: initialized local stuff..\n"); + + + pthread_mutex_init(&dispatch_mutex, NULL); + pthread_cond_init(&dispatch_cond, NULL); + + pthread_key_create(&tid_key, NULL); + tid = 0; + +#ifdef NOTHREADS +#else + for (i=0; i < READ_POOL_SIZE; i++) { + int ret, *t; + t = (int *)malloc(sizeof(int)); + *t = tid++; + ret = pthread_create(&read_pool[i], NULL, read_segment, t); + if (ret != 0) printf("Error starting thread %d\n", i); + } +#endif + + pthread_setspecific(tid_key, (void *)tid); + + 
/**************************************************************************
 *
 * parallax-threaded.h
 *
 * a few thread-specific defines
 *
 */

#ifndef __PARALLAX_THREADED_H__
#define __PARALLAX_THREADED_H__

/* pthread_key_t is used below; include its header so this file can be
 * included on its own, regardless of include order in the caller. */
#include <pthread.h>

#if 0
/* Turn off threading. */
#define NOTHREADS
#endif

/* Number of worker threads started for the read pool. */
#define READ_POOL_SIZE 128

/*
 * per-thread identifier
 *
 * NOTE(review): this is a definition, not a declaration, so every source
 * file that includes this header emits its own tentative definition of
 * tid_key.  That only links when the toolchain merges common symbols.
 * Consider "extern pthread_key_t tid_key;" here plus a single definition
 * in one .c file.
 */
pthread_key_t tid_key;

#endif /* __PARALLAX_THREADED_H__ */
+ * + * NOTE: This assumes that parent and child really are, and further that + * there are no other children forked from this parent. (children of the + * child are okay...) + */ + +int collapse(int height, u64 proot, u64 croot) +{ + int i, numlinks, ret, total = 0; + radix_tree_node pnode, cnode; + +//printf("proot: %Ld\n", getid(proot)); + if (height == 0) { + height = -1; /* terminate recursion */ + } else { + height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT; + } + numlinks = (1UL << RADIX_TREE_MAP_SHIFT); + + /* Terminal cases: */ + + if ( (getid(proot) == ZERO) || (getid(croot) == ZERO) ) + return -1; + + /* get roots */ + if ((pnode = readblock(getid(proot))) == NULL) + return -1; + + if ((cnode = readblock(getid(croot))) == NULL) + { + freeblock(pnode); + return -1; + } + + /* For each writable link in proot */ + for (i=0; i= 0 ) && ( iswritable(pnode[i]) ) ) + { + //printf(" %Ld is writable (i=%d).\n", getid(pnode[i]), i); + ret = collapse(height, pnode[i], cnode[i]); + if (ret == -1) + { + total = -1; + } else { + total += ret; + } + } + + + } + + /* if plink is writable, AND clink is writable -> free plink block */ + if ( ( iswritable(proot) ) && ( iswritable(croot) ) ) + { + releaseblock(getid(proot)); + if (ret >=0) total++; + //printf(" Delete %Ld\n", getid(proot)); + } +//printf("done : %Ld\n", getid(proot)); + return total; + +} + + +void print_root(u64 root, int height, FILE *dot_f) { FILE *f; int i; @@ -241,7 +312,9 @@ void print_root(u64 root, int height, u64 val, FILE *dot_f) getid(root), style[iswritable(root)], getid(root)); } - /* base case--return val */ + printf("print_root(%Ld)\n", getid(root)); + + /* base case */ if (height == 0) { /* add a node and edge for each child root */ node = (radix_tree_node) readblock(getid(root)); @@ -249,7 +322,7 @@ void print_root(u64 root, int height, u64 val, FILE *dot_f) return; for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) { - if (node[i] != 0) { + if (node[i] != ZERO) { fprintf(f, " 
n%Ld [%sshape=box,label=\"%Ld\"];\n", getid(node[i]), style[iswritable(node[i])], getid(node[i])); @@ -257,6 +330,7 @@ void print_root(u64 root, int height, u64 val, FILE *dot_f) getid(node[i]), i); } } + freeblock(node); return; } @@ -272,28 +346,17 @@ void print_root(u64 root, int height, u64 val, FILE *dot_f) /* add a node and edge for each child root */ for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) - if (node[i] != 0) { + if (node[i] != ZERO) { fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n", getid(node[i]), style[iswritable(node[i])], getid(node[i])); - print_root(node[i], height-RADIX_TREE_MAP_SHIFT, - val + (((u64)i)< n%Ld [label=\"%d\"]\n", getid(root), getid(node[i]), i); } - - /* - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - freeblock(state, getid(oldroot), node); - - if (height == 0) - return root; - - height -= RADIX_TREE_MAP_SHIFT; - */ - //} + freeblock(node); /* write graph postamble */ if (dot_f == NULL) { @@ -306,7 +369,9 @@ void print_root(u64 root, int height, u64 val, FILE *dot_f) int main(int argc, char **argv) { u64 key = ZERO, val = ZERO; - u64 root = writable(ONE); + u64 root = writable(2ULL); + u64 p = ZERO, c = ZERO; + int v; char buff[4096]; __init_blockstore(); @@ -321,18 +386,23 @@ int main(int argc, char **argv) { if (lseek(fp, 0, SEEK_END) == 0) { write(fp, buff, 4096); }*/ - + + allocblock(buff); + printf("Recognized commands:\n" "Note: the LSB of a node number indicates if it is writable\n" " root set root to \n" " snapshot take a snapshot of the root\n" " set set key=val\n" " get query key\n" + " c collapse\n" + " pr print tree to dot\n" + " pf <1=verbose> print freelist\n" " quit\n" "\nroot = %Ld\n", root); for (;;) { - print_root(root, 34, 0, NULL); - system("dot radix.dot -Tps -o radix.ps"); + //print_root(root, 34, NULL); + //system("dot radix.dot -Tps -o radix.ps"); printf("> "); fflush(stdout); @@ -344,8 +414,11 @@ int main(int argc, char **argv) { } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) { root 
= update(34, root, key, val); printf("root = %Ld\n", root); + } else if (sscanf(buff, " c %Ld %Ld", &p, &c) == 2) { + v = collapse(34, p, c); + printf("reclaimed %d blocks.\n", v); } else if (sscanf(buff, " get %Ld", &key) == 1) { - val = lookup(34, root, key, NULL); + val = lookup(34, root, key); printf("value = %Ld\n", val); } else if (!strcmp(buff, "quit\n")) { break; @@ -353,7 +426,11 @@ int main(int argc, char **argv) { root = snapshot(root); printf("new root = %Ld\n", root); } else if (sscanf(buff, " pr %Ld", &root) == 1) { - print_root(root, 34, 0, NULL); + print_root(root, 34, NULL); + } else if (sscanf(buff, " pf %d", &v) == 1) { + freelist_count(v); + } else if (!strcmp(buff, "pf\n")) { + freelist_count(0); } else { printf("command not recognized\n"); } diff --git a/tools/blktap/radix.h b/tools/blktap/radix.h index 8cca98f7ef..7feaf0c316 100644 --- a/tools/blktap/radix.h +++ b/tools/blktap/radix.h @@ -26,6 +26,7 @@ u64 lookup(int height, u64 root, u64 key); u64 update(int height, u64 root, u64 key, u64 val); u64 snapshot(u64 root); +int collapse(int height, u64 proot, u64 croot); int isprivate(int height, u64 root, u64 key); #endif /* __RADIX_H__ */ diff --git a/tools/blktap/snaplog.c b/tools/blktap/snaplog.c index 0647f1757a..5c030e3b5b 100644 --- a/tools/blktap/snaplog.c +++ b/tools/blktap/snaplog.c @@ -113,6 +113,11 @@ int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id) snap_id_t id = *old_id; snap_block_t *blk = snap_get_block(id.block); + if ( rec->deleted == 1 ) { + printf("Attempt to append a deleted snapshot!\n"); + return -1; + } + if ( blk->hdr.immutable != 0 ) { printf("Attempt to snap an immutable snap block!\n"); return -1; @@ -148,6 +153,65 @@ int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id) return 0; } +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id) +{ + snap_block_t *p_blk, *c_blk, *blk; + snap_rec_t *p_rec, *c_rec; + int ret = -1; + + p_blk = snap_get_block(p_id->block); + + if 
(p_blk == NULL) return(-1); + + if (c_id->block == p_id->block) + { + c_blk = p_blk; + } else { + c_blk = snap_get_block(c_id->block); + } + + if (p_blk == NULL) { + freeblock(p_blk); + return(-1); + } + + /* parent and child must not be deleted. */ + p_rec = &p_blk->snaps[p_id->index]; + c_rec = &c_blk->snaps[c_id->index]; + /* + if ( (p_rec->deleted == 1) || (c_rec->deleted == 1) ) { + printf("One of those snaps is already deleted.\n"); + goto done; + } + */ + /* first non-deleted thing in the log before child must be parent. */ + + /* XXX todo: text the range here for delete (and eventually fork) bits) */ + /* for now, snaps must be consecutive, on the same log page: */ + + if ((p_id->block != c_id->block) || (p_id->index != c_id->index-1)) + { + printf("Deleting non-consecutive snaps is not done yet.\n"); + goto done; + } + + /* mark parent as deleted XXX: may need to lock parent block here.*/ + p_rec->deleted = 1; + writeblock(p_id->block, p_blk); + + /* delete the parent */ + printf("collapse(%Ld, %Ld)\n", p_rec->radix_root, c_rec->radix_root); + ret = collapse(height, p_rec->radix_root, c_rec->radix_root); + + /* return the number of blocks reclaimed. */ + +done: + if (c_blk != p_blk) freeblock(c_blk); + freeblock(p_blk); + + return(ret); +} + void snap_print_history(snap_id_t *snap_id) { snap_id_t id = *snap_id; diff --git a/tools/blktap/snaplog.h b/tools/blktap/snaplog.h index 785dccfc66..02748bdedb 100644 --- a/tools/blktap/snaplog.h +++ b/tools/blktap/snaplog.h @@ -5,9 +5,13 @@ * Snapshot log on-disk data structure. 
* */ - + +#include "radix.h" #include "blockstore.h" /* for BLOCK_SIZE */ +#ifndef __SNAPLOG_H__ +#define __SNAPLOG_H__ + typedef struct snap_id { u64 block; unsigned int index; @@ -16,11 +20,14 @@ typedef struct snap_id { typedef struct snap_rec { u64 radix_root; struct timeval timestamp; + /* flags: */ + unsigned deleted:1; } snap_rec_t; int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id); int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id); +int snap_collapse(int height, snap_id_t *p_id, snap_id_t *c_id); void snap_print_history(snap_id_t *snap_id); int snap_get_id(snap_id_t *id, snap_rec_t *target); @@ -50,3 +57,5 @@ typedef struct snap_block { snap_block_t *snap_get_block(u64 block); + +#endif /* __SNAPLOG_H__ */ diff --git a/tools/blktap/vdi.c b/tools/blktap/vdi.c index e17eb70dac..7c6c63094c 100644 --- a/tools/blktap/vdi.c +++ b/tools/blktap/vdi.c @@ -15,8 +15,8 @@ #include "radix.h" #include "vdi.h" -#define VDI_REG_BLOCK 1LL -#define VDI_RADIX_ROOT writable(2) +#define VDI_REG_BLOCK 2LL +#define VDI_RADIX_ROOT writable(3) #if 1 #define DPRINTF(_f, _a...) printf ( _f , ## _a ) @@ -158,6 +158,7 @@ void vdi_snapshot(vdi_t *vdi) rec.radix_root = vdi->radix_root; gettimeofday(&rec.timestamp, NULL); + rec.deleted = 0; vdi->radix_root = snapshot(vdi->radix_root); ret = snap_append(&vdi->snap, &rec, &vdi->snap); diff --git a/tools/blktap/vdi_snap_delete.c b/tools/blktap/vdi_snap_delete.c new file mode 100644 index 0000000000..0160ccad83 --- /dev/null +++ b/tools/blktap/vdi_snap_delete.c @@ -0,0 +1,48 @@ +/************************************************************************** + * + * vdi_snap_delete.c + * + * Delete a snapshot. + * + * This is not finished: right now it takes a snap n and calls + * snap_collapse(n,n+1). + * + * TODO: support for non-consecutive, non-same-block snaps + * Avoid forking probs. 
+ * + */ + +#include +#include +#include +#include +#include "blockstore.h" +#include "snaplog.h" +#include "radix.h" +#include "vdi.h" + +int main(int argc, char *argv[]) +{ + snap_id_t id, c_id; + int ret; + + __init_blockstore(); + __init_vdi(); + + if ( argc != 3 ) { + printf("usage: %s \n", argv[0]); + exit(-1); + } + + id.block = (u64) atoll(argv[1]); + id.index = (unsigned int) atol (argv[2]); + + c_id = id; + c_id.index++; + + ret = snap_collapse(VDI_HEIGHT, &id, &c_id); + + printf("Freed %d blocks.\n", ret); + + return 0; +} diff --git a/tools/blktap/vdi_snap_list.c b/tools/blktap/vdi_snap_list.c index 32b20a6261..044397495d 100644 --- a/tools/blktap/vdi_snap_list.c +++ b/tools/blktap/vdi_snap_list.c @@ -49,8 +49,10 @@ int main(int argc, char *argv[]) sid = vdi->snap; sid.index--; - //printf("%8s%4s%21s %12s\n", "Block", "idx", "timestamp", "radix root"); - printf("%8s%4s%37s %12s\n", "Block", "idx", "timestamp", "radix root"); + //printf("%8s%4s%21s %12s %1s\n", "Block", "idx", "timestamp", + // "radix root", "d"); + printf("%8s%4s%37s %12s %1s\n", "Block", "idx", "timestamp", + "radix root", "d"); while (sid.block != 0) { blk = snap_get_block(sid.block); @@ -61,13 +63,14 @@ int main(int argc, char *argv[]) } t = ctime(&blk->snaps[i].timestamp.tv_sec); t[strlen(t)-1] = '\0'; - //printf("%8Ld%4u%14lu.%06lu %12Ld\n", - printf("%8Ld%4u%30s %06lu %12Ld\n", + //printf("%8Ld%4u%14lu.%06lu %12Ld %1s\n", + printf("%8Ld%4u%30s %06lu %12Ld %1s\n", sid.block, i, //blk->snaps[i].timestamp.tv_sec, t, blk->snaps[i].timestamp.tv_usec, - blk->snaps[i].radix_root); + blk->snaps[i].radix_root, + blk->snaps[i].deleted ? "*" : " "); if ( max_snaps != -1 ) max_snaps--; } -- 2.30.2